# -*- coding: utf-8 -*-  
""" 
Created on Tue Oct 17 22:50:20 2017 
 
@author: sirius
"""
import re
import urllib
import urllib.parse

import requests
from bs4 import BeautifulSoup as bs


# Bracketed annotations to strip: ASCII parentheses, ASCII square brackets and
# full-width parentheses. Matching is greedy, so everything from the first
# opening bracket to the last matching closing bracket is removed.
_BRACKETED_TEXT = re.compile(r'\(.*\)|\[.*\]|（.*）')


def clean_app_name(app_name):
    """Normalize an app name before it is used as a search keyword.

    Removes every non-breaking space (U+00A0) and any bracketed annotation
    written with ``(...)``, ``[...]`` or full-width ``（...）``.

    Literal ``.`` and ``*`` characters are preserved; an earlier version of
    the pattern contained the unescaped character class ``[.*]``, which
    silently deleted them (e.g. version numbers such as ``2.0``).

    Args:
        app_name: The raw application name.

    Returns:
        The cleaned application name.
    """
    without_nbsp = app_name.replace(u'\u00a0', '')
    return _BRACKETED_TEXT.sub('', without_nbsp)


# Seconds to wait for the search page before giving up, so a stalled
# connection cannot hang the caller forever.
_REQUEST_TIMEOUT_SECONDS = 10

# Position (0-based) of the <div> whose text carries the download count.
_COUNT_DIV_INDEX = 11

# Chinese magnitude suffixes used in the displayed count.
_UNIT_MULTIPLIERS = {'亿': 100000000, '万': 10000}


def _parse_download_count(text):
    """Convert a count such as '1.2亿', '350万' or '987' to a float; 1 if unparseable."""
    if not text:
        return 1
    multiplier = _UNIT_MULTIPLIERS.get(text[-1])
    try:
        if multiplier is None:
            return float(text)
        return float(text[:-1]) * multiplier
    except ValueError:
        return 1


def spider(search):
    """Scrape the download count for an app from the Wandoujia search page.

    The name is first normalized with ``clean_app_name`` and URL-quoted into
    the search URL. The page is fetched and parsed with the ``lxml`` parser,
    and the count is read from the text of the 12th ``<div>`` after dropping
    its last three characters.

    Args:
        search: The application name to search for.

    Returns:
        The download count as a float, with the ``亿`` (1e8) and ``万`` (1e4)
        suffixes expanded. Returns ``1`` when the page does not have the
        expected structure or the count cannot be parsed.

    Raises:
        requests.RequestException: If the HTTP request fails or times out.
    """
    # The name may be irregular, so clean it before building the query.
    search = clean_app_name(search)
    # Build the search-result page URL.
    url = 'http://www.wandoujia.com/search?key=%s' % (
        urllib.parse.quote(search.encode("utf8")))
    html = requests.get(url, timeout=_REQUEST_TIMEOUT_SECONDS).text
    divs = bs(html, 'lxml').find_all('div')

    # An empty or restructured result page may not contain the expected div;
    # fall back to 1 instead of raising IndexError.
    if len(divs) <= _COUNT_DIV_INDEX:
        return 1
    count_div = divs[_COUNT_DIV_INDEX]

    # NOTE(review): len() of a BeautifulSoup Tag counts its direct children,
    # not the characters of its text. This check is kept as originally
    # written; confirm whether a text-length check was intended.
    if len(count_div) <= 3:
        return 1

    # The last three characters are dropped before parsing — presumably a
    # fixed textual suffix after the number; verify against the live page.
    return _parse_download_count(count_div.text[:-3])


def get_pkg_url(pkg):
    """Build the Wandoujia app detail page URL for a package.

    Args:
        pkg: The package identifier to interpolate into the URL.

    Returns:
        The detail page URL as a string.
    """
    detail_url_template = 'http://www.wandoujia.com/apps/%s'
    return detail_url_template % pkg


# Ad-hoc run: look up the download count for a sample app name.
# NOTE: this statement executes whenever the module is loaded and calls
# spider(), which issues an HTTP request.
downloads = spider(search='Telephone')
# print(downloads)
